{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10000, 160)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_x1 = pd.read_csv('../data/train_ssvm_xy.csv')\n",
    "train_x1.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(15000, 160)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_x2 = pd.read_csv('../data/train_xy.csv')\n",
    "train_x2.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10000, 159)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test = pd.read_csv('../data/test_all.csv')\n",
    "test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25000, 157)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_x11 = train_x1.drop(['cust_id','cust_group','y'],axis=1)\n",
    "train_x22 = train_x2.drop(['\\ufeffcust_id','cust_group','y'],axis=1)\n",
    "train_x = pd.concat([train_x11, train_x22])\n",
    "train_x.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10000, 157)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_x = test.drop(['cust_id','cust_group'],axis=1)\n",
    "test_x.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(35000, 157)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = pd.concat([train_x, test_x])\n",
    "x.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25000,)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_y1 = train_x1['y']\n",
    "train_y2 = train_x2['y']\n",
    "Y_train = train_y1.append(train_y2)\n",
    "Y_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(35000, 364)\n"
     ]
    }
   ],
   "source": [
    "for i in range(96,158):\n",
    "    col = 'x'+'_'+str(i)\n",
    "    if col in x.columns.values:\n",
    "        dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col +'_'+ str(x))\n",
    "        x = pd.concat([x, dummies_df], axis=1)\n",
    "print(x.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(25000, 364)\n",
      "(10000, 364)\n"
     ]
    }
   ],
   "source": [
    "train_X = x[0:25000]\n",
    "test_X = x[25000:35000]\n",
    "print(train_X.shape)\n",
    "print(test_X.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.ensemble import AdaBoostClassifier\n",
    "from sklearn.ensemble import ExtraTreesClassifier\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.svm import SVC\n",
    "from sklearn import metrics  #accuracy_score,recall_score,f1_score\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.metrics import precision_recall_fscore_support\n",
    "from sklearn.utils.multiclass import unique_labels\n",
    "from sklearn.metrics import accuracy_score\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.cross_validation import cross_val_score\n",
    "from lightgbm import LGBMClassifier\n",
    "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn import linear_model\n",
    "import lightgbm as lgb\n",
    "import xgboost as xgb\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "from keras.models import Model\n",
    "from keras.layers import Dense, Input"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# encoding_dim = 600\n",
    "# input_dim = Input(shape=(364,))\n",
    "\n",
    "# encoded = Dense(364, activation='linear')(input_dim)\n",
    "# # encoded = Dense(300, activation='relu')(encoded)\n",
    "# # encoded = Dense(32, activation='relu')(encoded)\n",
    "# encoder_output = Dense(encoding_dim)(encoded)\n",
    "\n",
    "# decoded = Dense(600, activation='relu')(encoder_output)\n",
    "# # decoded = Dense(64, activation='relu')(decoded)\n",
    "# # decoded = Dense(128, activation='relu')(decoded)\n",
    "# decoded = Dense(364, activation='tanh')(decoded)\n",
    "\n",
    "# autoencoder = Model(inputs=input_dim, outputs=decoded)\n",
    "\n",
    "# encoder = Model(inputs=input_dim, outputs=encoder_output)\n",
    "\n",
    "# autoencoder.compile(optimizer='adam', loss='mse')\n",
    "# # training\n",
    "# autoencoder.fit(train_X.values, train_X.values, epochs=20, batch_size=150, shuffle=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# new_train_feature = encoder.predict(train_X.values)\n",
    "# new_test_feature = encoder.predict(test_X.values)\n",
    "# print(new_train_feature.shape)\n",
    "# print(new_test_feature.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5, error_score='raise',\n",
       "       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,\n",
       "              learning_rate=0.1, loss='deviance', max_depth=3,\n",
       "              max_features=None, max_leaf_nodes=None,\n",
       "              min_impurity_split=1e-07, min_samples_leaf=1,\n",
       "              min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
       "              n_estimators=100, presort='auto', random_state=None,\n",
       "              subsample=1.0, verbose=0, warm_start=False),\n",
       "       fit_params={}, iid=True, n_jobs=1,\n",
       "       param_grid=[{'learning_rate': [0.01, 0.1], 'n_estimators': range(100, 300, 500), 'max_depth': range(4, 8, 12)}],\n",
       "       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n",
       "       scoring='roc_auc', verbose=0)"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tuned_parameters= [{'n_estimators':range(100,300,500),\n",
    "                  'max_depth':range(4,8,12),\n",
    "                  'learning_rate':[0.01, 0.1]\n",
    "                  }]\n",
    "clf = GridSearchCV(GradientBoostingClassifier(), tuned_parameters, cv=5, scoring='roc_auc')\n",
    "clf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.804463759173\n"
     ]
    }
   ],
   "source": [
    "predictions = clf.predict_proba(X_val)\n",
    "pre = predictions[:,1]\n",
    "val_auc = metrics.roc_auc_score(y_val,pre)#验证集上的auc值\n",
    "print(val_auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10000,)"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preds = clf.predict_proba(test_X)\n",
    "pred = preds[:,1]\n",
    "pred.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "Submission = pd.DataFrame({'cust_id': test['cust_id'], 'pred_prob': pred})\n",
    "Submission.to_csv('../result/semi_gbdt.csv',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "xgb = pd.read_csv('../result/semi_xgb4.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "lgb = pd.read_csv('../result/semi_lgb2.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "result = xgb.pred_prob*0.3 + lgb.pred_prob*0.7"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cust_id</th>\n",
       "      <th>pred_prob</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0.038582</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>0.087885</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>0.342310</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>0.213558</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0.193331</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   cust_id  pred_prob\n",
       "0        1   0.038582\n",
       "1        2   0.087885\n",
       "2        3   0.342310\n",
       "3        4   0.213558\n",
       "4        5   0.193331"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "xgb.pred_prob = result\n",
    "xgb.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "xgb.to_csv('../result/semi_xgb_lgb1.csv',index= False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
